title: “Chen lab RNA-seq version 3 (S449 Samples redone by Novogene again)” output: html_notebook date: April 5, 2019 #human cell lines # si RNA knockdown experiment
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 3.5.2
library(edgeR)
## Warning: package 'edgeR' was built under R version 3.5.2
## Loading required package: limma
library(DESeq2)
## Warning: package 'DESeq2' was built under R version 3.5.2
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
## Warning: package 'BiocParallel' was built under R version 3.5.2
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply
library(biomaRt)
library(ggplot2)
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggplot2_3.1.0 biomaRt_2.38.0
## [3] DESeq2_1.22.2 SummarizedExperiment_1.12.0
## [5] DelayedArray_0.8.0 BiocParallel_1.16.5
## [7] matrixStats_0.54.0 Biobase_2.42.0
## [9] GenomicRanges_1.34.0 GenomeInfoDb_1.18.1
## [11] IRanges_2.16.0 S4Vectors_0.20.1
## [13] BiocGenerics_0.28.0 edgeR_3.24.3
## [15] limma_3.38.3 pheatmap_1.0.12
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.0 bit64_0.9-7 splines_3.5.1
## [4] Formula_1.2-3 assertthat_0.2.0 latticeExtra_0.6-28
## [7] blob_1.1.1 GenomeInfoDbData_1.2.0 progress_1.2.0
## [10] yaml_2.2.0 RSQLite_2.1.1 pillar_1.3.1
## [13] backports_1.1.3 lattice_0.20-38 glue_1.3.0
## [16] digest_0.6.18 RColorBrewer_1.1-2 XVector_0.22.0
## [19] checkmate_1.9.0 colorspace_1.3-2 htmltools_0.3.6
## [22] Matrix_1.2-15 plyr_1.8.4 XML_3.98-1.16
## [25] pkgconfig_2.0.2 genefilter_1.64.0 zlibbioc_1.28.0
## [28] xtable_1.8-3 purrr_0.2.5 scales_1.0.0
## [31] tibble_2.0.0 htmlTable_1.13.1 annotate_1.60.0
## [34] withr_2.1.2 nnet_7.3-12 lazyeval_0.2.1
## [37] survival_2.43-3 magrittr_1.5 crayon_1.3.4
## [40] memoise_1.1.0 evaluate_0.12 foreign_0.8-71
## [43] prettyunits_1.0.2 tools_3.5.1 data.table_1.11.8
## [46] hms_0.4.2 stringr_1.3.1 munsell_0.5.0
## [49] locfit_1.5-9.1 cluster_2.0.7-1 AnnotationDbi_1.44.0
## [52] bindrcpp_0.2.2 compiler_3.5.1 rlang_0.3.1
## [55] grid_3.5.1 RCurl_1.95-4.11 rstudioapi_0.9.0
## [58] htmlwidgets_1.3 bitops_1.0-6 base64enc_0.1-3
## [61] rmarkdown_1.11 gtable_0.2.0 DBI_1.0.0
## [64] R6_2.3.0 gridExtra_2.3 knitr_1.21
## [67] dplyr_0.7.8 bit_1.1-14 bindr_0.1.1
## [70] Hmisc_4.1-1 stringi_1.2.4 Rcpp_1.0.0
## [73] geneplotter_1.60.0 rpart_4.1-13 acepack_1.4.1
## [76] tidyselect_0.2.5 xfun_0.4
Read in files. Add new samples that are technical replicates (S449).
filename = "/Users/tfriedrich/Downloads/readcounts.txt"
counts1_old = read.table(filename, header = T,stringsAsFactors = F)
rownames(counts1_old) = counts1_old$Geneid
counts1_gene_annotation = (counts1_old[,1:6])
counts1_old = data.matrix(counts1_old[,7:ncol(counts1_old)])
colnames(counts1_old) = sub("_1_algn_Aligned", "", matrix(unlist(strsplit(colnames(counts1_old), "\\.")), byrow =T, ncol=6)[,3])
filename2 = "/Users/tfriedrich/Downloads/readcounts_newsamples_version3.txt"
counts1_new = read.table(filename2, header=T, stringsAsFactors = F)
rownames(counts1_new) = counts1_new$Geneid
counts1_new = data.matrix(counts1_new[,7:ncol(counts1_new)])
colnames(counts1_new) = sub("_1_algn_Aligned", "", matrix(unlist(strsplit(colnames(counts1_new), "\\.")), byrow =T, ncol=7)[,4])
length(which(rownames(counts1_old)==rownames(counts1_new)))
## [1] 58302
counts1 = data.frame(cbind( counts1_old, counts1_new))
dim(counts1)
## [1] 58302 24
Filter out genes with low and high counts ( high counts might be mitochondrial genes)
keep = rowMeans(counts1) > 5 & rowMeans(counts1) < 5000
counts2 = counts1[keep,]
Take the log (RPKM)
y <- DGEList(counts=counts2)
fpkm_log_matrix = rpkm (y, gene.length=counts1_gene_annotation[keep,"Length"], log = TRUE)
Dendrogram
hc2 <- hclust(stats::dist(t(fpkm_log_matrix), method="minkowski"), "ward.D2")
plot(hc2, lwd=4, lty=1, col="black", col.lab="red", xlab="Samples", main="Samples clustered by log2(FPKM+1) using Ward's clustering & Euclidean distance")
Some replicates don’t cluster (see FocusT replicates). Samples cluster by cell type. SS49 and SS475 are more similar than the other cell types.
S449 technical replicate showing batch effects.
Setup up design matrix for PCA plot
groupNamesPerSample = c(matrix (unlist(strsplit(colnames(counts2[,1:20]), "_")), byrow=T, ncol=4) [,1], colnames(counts2[,21:24]))
groupNamesPerSample = unlist( lapply (groupNamesPerSample, function(x) sub("S475", "", sub("S449", "", sub("PLC", "", sub("Focus", "", x))))) )
conditionFactor <- factor(groupNamesPerSample)
subjectgroup = c(matrix (unlist(strsplit(colnames(counts2[,1:20]), "_")), byrow=T, ncol=4) [,1], colnames(counts2[,21:24]))
subjectgroup = unlist( lapply (subjectgroup, function(x) sub("SC", "", sub("YT", "", sub("T", "", sub("Y", "", x))))) )
Subject <- factor(subjectgroup)
PCA plot
cold <- data.frame("conditionFactor"=conditionFactor, "Subject"=Subject)
design_matrix <- model.matrix( ~Subject + conditionFactor)
ddsMat <- DESeq2::DESeqDataSetFromMatrix(countData=counts2, colData=cold, design=~conditionFactor);
colnames(ddsMat) <- colnames(counts2)
rld <- DESeq2::rlog(ddsMat) # LOG TRANSFORMED.
pca <- prcomp(t(assay(rld)))
allPer.vec <- 100 * summary(pca)$importance[2,]
allPer.text <- paste(format(allPer.vec,digits=0, nsmall=1, scientific=FALSE), "%", sep='')
allPerLabels.vec <- paste(names(allPer.vec), "\n", allPer.text, sep='')
barplot(allPer.vec, main="PCA: % variance explained by each component\n" , ylim=c(0,100), names.arg=allPerLabels.vec)
d <- data.frame(pca$x, group=conditionFactor, Subject, name=colnames(rld)) # returns PC1, PC2, ... PC8... etc
ppp <- ggplot(d, aes_string(x="PC1", y="PC2", shape="Subject", color="group"))
ppp <- ppp + geom_point(size=6)
plot(ppp)
ppp <- ggplot(d, aes_string(x="PC1", y="PC2", shape="Subject", color="group"))
ppp <- ppp + geom_point(size=3) + geom_text(aes(label=name), nudge_y=6)
plot(ppp)
Samples seem to cluster by cell type. old S449 and S475 appear to be very similar. Replicates cluster together mostly.
Group/add up samples with technical replicates
#done manually using counts3 because I don't want to use matrix counts with 1 added to the value
counts3 = data.frame(
counts2[, "FocusSC_USR17001380L_HJHCJBBXX_L6"]
, counts2[, "FocusT_USR17001382L_HJHCJBBXX_L7"] + counts2[, "FocusT_USR17001382L_HKF5JBBXX_L1"]
, counts2[, "FocusYT_USR17001383L_hkff7BBXX_L1"] + counts2[, "FocusYT_USR17001383L_HKY5KBBXX_L1"]
, counts2[, "FocusY_USR17001381L_HJHCJBBXX_L6"]
, counts2[, "PLCSC_USR17002043L_HJHCJBBXX_L7"]
, counts2[, "PLCT_USR17002044L_HJHCJBBXX_L7"]
, counts2[, "PLCYT_USR17001379L_HJHCJBBXX_L6"]
, counts2[, "PLCY_USR17001377L_HJHCJBBXX_L6"]
, counts2[, "S449SC_USR17001384L_HJHCJBBXX_L7"]
, counts2[, "S449T_USR17001386L_HJHCJBBXX_L7"]
, counts2[, "S449YT_USR17002046L_HJHCJBBXX_L7"] + counts2[, "S449YT_USR17002046L_HKF5JBBXX_L3"]
, counts2[, "S449Y_USR17001385L_HJHCJBBXX_L7"] + counts2[, "S449Y_USR17001385L_HKF5JBBXX_L3"]
, counts2[, "S475SC_USR17001372L_HJHCJBBXX_L6"]
, counts2[, "S475T_USR17001374L_HJHCJBBXX_L6"]
, counts2[, "S475YT_USR17001375L_HJHCJBBXX_L6"]
, counts2[, "S475Y_USR17001373L_HJHCJBBXX_L6"]
, counts2[, "S449SC"]
, counts2[, "S449T"]
, counts2[, "S449Y"]
, counts2[, "S449YT"]
)
colnames(counts3)
## [1] "counts2....FocusSC_USR17001380L_HJHCJBBXX_L6.."
## [2] "counts2....FocusT_USR17001382L_HJHCJBBXX_L7.....counts2....FocusT_USR17001382L_HKF5JBBXX_L1.."
## [3] "counts2....FocusYT_USR17001383L_hkff7BBXX_L1.....counts2....FocusYT_USR17001383L_HKY5KBBXX_L1.."
## [4] "counts2....FocusY_USR17001381L_HJHCJBBXX_L6.."
## [5] "counts2....PLCSC_USR17002043L_HJHCJBBXX_L7.."
## [6] "counts2....PLCT_USR17002044L_HJHCJBBXX_L7.."
## [7] "counts2....PLCYT_USR17001379L_HJHCJBBXX_L6.."
## [8] "counts2....PLCY_USR17001377L_HJHCJBBXX_L6.."
## [9] "counts2....S449SC_USR17001384L_HJHCJBBXX_L7.."
## [10] "counts2....S449T_USR17001386L_HJHCJBBXX_L7.."
## [11] "counts2....S449YT_USR17002046L_HJHCJBBXX_L7.....counts2....S449YT_USR17002046L_HKF5JBBXX_L3.."
## [12] "counts2....S449Y_USR17001385L_HJHCJBBXX_L7.....counts2....S449Y_USR17001385L_HKF5JBBXX_L3.."
## [13] "counts2....S475SC_USR17001372L_HJHCJBBXX_L6.."
## [14] "counts2....S475T_USR17001374L_HJHCJBBXX_L6.."
## [15] "counts2....S475YT_USR17001375L_HJHCJBBXX_L6.."
## [16] "counts2....S475Y_USR17001373L_HJHCJBBXX_L6.."
## [17] "counts2....S449SC.."
## [18] "counts2....S449T.."
## [19] "counts2....S449Y.."
## [20] "counts2....S449YT.."
colnames(counts3) = c(unlist(lapply (strsplit(sub("counts2....", "" ,colnames(counts3[,1:16])) , "_"), `[[`, 1)), sub("counts2.", "", colnames(counts3[, 17:20])))
colnames(counts3) = gsub("\\.", "", colnames(counts3))
rownames(counts3) = rownames(counts2)
Look at histogram of counts.
library(ggplot2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
ggplot(gather((counts3[,1:4]), cols, value), aes(x = value)) +
geom_histogram(binwidth=100) + facet_grid(.~cols)
ggplot(gather((counts3[,5:8]), cols, value), aes(x = value)) +
geom_histogram(binwidth=100) + facet_grid(.~cols)
ggplot(gather((counts3[,9:12]), cols, value), aes(x = value)) +
geom_histogram(binwidth=100) + facet_grid(.~cols)
ggplot(gather((counts3[,13:16]), cols, value), aes(x = value)) +
geom_histogram(binwidth=100) + facet_grid(.~cols)
ggplot(gather((counts3[,17:20]), cols, value), aes(x = value)) +
geom_histogram(binwidth=100) + facet_grid(.~cols)
Setup up design matrix for Heatmap
Condition_heatmap = unlist( lapply (colnames(counts3), function(x) sub("S475", "", sub("S449", "", sub("PLC", "", sub("Focus", "", x))))) )
Condition_heatmap <- factor(Condition_heatmap)
Subject_heatmap = unlist( lapply (colnames(counts3), function(x) sub("SC", "", sub("YT", "", sub("T", "", sub("Y", "", x))))) )
Subject_heatmap <- factor(Subject_heatmap)
cold <- data.frame("Condition_heatmap"=Condition_heatmap, "Subject_heatmap"=Subject_heatmap) #"sample_name"=colnames(counts4)
#design_matrix2 <- model.matrix( ~Subject2 + Condition)
FPKM after combinining samples that are “technical” replicates
colnames(counts3) = paste0(colnames(counts3), c(rep("_1", 16), rep("_2", 4)))
y_combined <- DGEList(counts=counts3)
fpkm_matrix = rpkm (y_combined, gene.length=counts1_gene_annotation[keep,"Length"], log = FALSE)
Heatmap to see how cells cluster by cell type or experimental condition. No replicates because they have been combined.
HEATMAP_HEIGHT <- 45 # inchescol
HEATMAP_WIDTH <- 15
annotation <- data.frame(Expt = Condition_heatmap, Celltype=Subject_heatmap)
rownames(annotation) <- colnames(counts3)
pheatmap(fpkm_matrix,
, width=HEATMAP_WIDTH, height=HEATMAP_HEIGHT
, scale="row" # actually scale each row
, clustering_distance_rows="correlation" # or: euclidean
, clustering_distance_cols="correlation"
, clustering_method="complete"
, display_numbers=FALSE
, border_color="#00000000" # note: the last two digits here are the alpha channel/ transparency
, show_rownames=FALSE, show_colnames = TRUE
, annotation = annotation
, fontsize_row=3.5)
This heatmap confirms the pattern seen in the dendrogram and PCA plot.
y_combined <- DGEList(counts=counts3)
fpkm_matrix = rpkm (y_combined, gene.length=counts1_gene_annotation[keep,"Length"], log = FALSE)
#wwtr1
fpkm_matrix_wwtr1 = fpkm_matrix[which(rownames(fpkm_matrix) == "ENSG00000018408"), ]
fpkm_matrix_wwtr1_percent = c(cbind(
fpkm_matrix_wwtr1["FocusT_1"]/fpkm_matrix_wwtr1[ "FocusSC_1"]
, fpkm_matrix_wwtr1["PLCT_1"]/fpkm_matrix_wwtr1[ "PLCSC_1"]
, fpkm_matrix_wwtr1["S475T_1"]/fpkm_matrix_wwtr1[ "S475SC_1"]
, fpkm_matrix_wwtr1["S449T_1"]/fpkm_matrix_wwtr1[ "S449SC_1"]
, fpkm_matrix_wwtr1["S449T_2"]/fpkm_matrix_wwtr1[ "S449SC_2"]
, fpkm_matrix_wwtr1["FocusYT_1"]/fpkm_matrix_wwtr1[ "FocusSC_1"]
, fpkm_matrix_wwtr1["PLCYT_1"]/fpkm_matrix_wwtr1[ "PLCSC_1"]
, fpkm_matrix_wwtr1["S475YT_1"]/fpkm_matrix_wwtr1[ "S475SC_1"]
, fpkm_matrix_wwtr1["S449YT_1"]/fpkm_matrix_wwtr1[ "S449SC_1"]
, fpkm_matrix_wwtr1["S449YT_2"]/fpkm_matrix_wwtr1[ "S449SC_2"] ))
names(fpkm_matrix_wwtr1_percent ) = c("FocusT/SC", "PLCT/SC", "S475T/SC", "S449T/SC", "S449T_2/SC_2", "FocusYT/SC", "PLCYT/SC", "S475YT/SC", "S449YT/SC", "S449YT_2/SC_2")
#barplot(fpkm_matrix_wwtr1, las=2)
barplot(fpkm_matrix_wwtr1_percent, las=2, cex.names=.7, ylab= "percent KD", main="wwtr1 based on fpkm values")
#yap1
fpkm_matrix_yap1 = fpkm_matrix[which(rownames(fpkm_matrix) == "ENSG00000137693"), ]
fpkm_matrix_yap1_percent = c(cbind(
fpkm_matrix_yap1["FocusY_1"]/fpkm_matrix_yap1[ "FocusSC_1"]
, fpkm_matrix_yap1["PLCY_1"]/fpkm_matrix_yap1[ "PLCSC_1"]
, fpkm_matrix_yap1["S475Y_1"]/fpkm_matrix_yap1[ "S475SC_1"]
, fpkm_matrix_yap1["S449Y_1"]/fpkm_matrix_yap1[ "S449SC_1"]
, fpkm_matrix_yap1["S449Y_2"]/fpkm_matrix_yap1[ "S449SC_2"]
, fpkm_matrix_yap1["FocusYT_1"]/fpkm_matrix_yap1[ "FocusSC_1"]
, fpkm_matrix_yap1["PLCYT_1"]/fpkm_matrix_yap1[ "PLCSC_1"]
, fpkm_matrix_yap1["S475YT_1"]/fpkm_matrix_yap1[ "S475SC_1"]
, fpkm_matrix_yap1["S449YT_1"]/fpkm_matrix_yap1[ "S449SC_1"]
, fpkm_matrix_yap1["S449YT_2"]/fpkm_matrix_yap1[ "S449SC_2"] ))
names(fpkm_matrix_yap1_percent ) = c("FocusY/SC", "PLCY/SC", "S475Y/SC", "S449Y/SC", "S449Y_2/SC_2", "FocusYT/SC", "PLCYT/SC", "S475YT/SC", "S449YT/SC", "S449YT_2/SC_2")
#barplot(fpkm_matrix_yap1, las=2)
barplot(fpkm_matrix_yap1_percent, las=2, cex.names=.7, ylab= "percent KD", main = "yap1 based on fpkm values")
S449Y_2 appears to have worked! Unfortunately, S447T_2 appears to have failed. It will be more difficult to justify removing samples that didn’t knock down its target.
Remove old S449 samples from matrix before fitting a model. Fit the model and estimate dispersion value.
counts4 = counts3[, c("FocusSC_1", "FocusT_1", "FocusYT_1", "FocusY_1", "PLCSC_1", "PLCT_1", "PLCYT_1", "PLCY_1", "S475SC_1", "S475T_1", "S475YT_1", "S475Y_1", "S449SC_2", "S449T_2", "S449Y_2", "S449YT_2" )]
Setup up design matrix for Heatmap
Condition_DE_ = unlist( lapply (matrix(unlist(strsplit(colnames(counts4), "_")), byrow=T, ncol=2)[,1], function(x) sub("S475", "", sub("S449", "", sub("PLC", "", sub("Focus", "", x))))) )
Condition_DE_ <- factor(Condition_DE_)
Subject_DE_ = unlist( lapply (matrix(unlist(strsplit(colnames(counts4), "_")), byrow=T, ncol=2)[,1], function(x) sub("SC", "", sub("YT", "", sub("T", "", sub("Y", "", x))))) )
Subject_DE_ <- factor(Subject_DE_)
design_matrix_DE <- model.matrix( ~Subject_DE_ + Condition_DE_)
colnames(design_matrix_DE)
## [1] "(Intercept)" "Subject_DE_PLC" "Subject_DE_S449" "Subject_DE_S475"
## [5] "Condition_DE_T" "Condition_DE_Y" "Condition_DE_YT"
y_for_DE <- DGEList(counts=counts4)
y_for_DE <- calcNormFactors(y_for_DE)
y_for_DE = estimateDisp(y_for_DE, design=design_matrix_DE)
plotBCV(y_for_DE)
fit <- glmFit(y_for_DE, design=design_matrix_DE, dispersion=y_for_DE$trended.dispersion)
Run differential expression SC vs YT
lrt_T <- glmLRT(fit,coef = "Condition_DE_T")
topTags(lrt_T)
## Coefficient: Condition_DE_T
## logFC logCPM LR PValue FDR
## ENSG00000099974 -2.289454 5.864982 54.14714 1.860245e-13 3.982041e-09
## ENSG00000099977 -2.234506 5.895525 51.98310 5.598031e-13 5.991573e-09
## ENSG00000241313 -2.271751 3.424852 36.54100 1.494884e-09 1.066650e-05
## ENSG00000212724 -2.110529 5.695629 31.15033 2.387994e-08 1.277935e-04
## ENSG00000159055 -1.797491 4.440277 29.81984 4.741147e-08 2.029780e-04
## ENSG00000108821 1.611116 6.720624 27.96323 1.236429e-07 4.411166e-04
## ENSG00000138463 -1.596078 4.540343 24.86459 6.150176e-07 1.880724e-03
## ENSG00000181773 -2.166823 1.490235 21.50004 3.538213e-06 9.467374e-03
## ENSG00000189068 -2.761361 1.025508 21.25495 4.020700e-06 9.563012e-03
## ENSG00000150867 1.337408 6.370193 20.95854 4.693304e-06 1.004649e-02
smoothScatter(x=lrt_T$table$logFC, y=-log10(lrt_T$table$PValue))
lrt_T_table = data.frame(lrt_T$table, p.adjust(lrt_T$table$PValue, method = "BH"))
colnames(lrt_T_table) = c(colnames(lrt_T$table), "adjPVal_BH")
hist(lrt_T_table$PValue)
SC vs Y
lrt_Y <- glmLRT(fit,coef = "Condition_DE_Y")
topTags(lrt_Y)
## Coefficient: Condition_DE_Y
## logFC logCPM LR PValue FDR
## ENSG00000137693 -3.427520 6.901856 113.67506 1.535119e-26 3.286075e-22
## ENSG00000254422 -3.857873 4.607185 112.14306 3.324356e-26 3.558058e-22
## ENSG00000128564 3.274088 5.202611 86.56695 1.350859e-20 9.638832e-17
## ENSG00000267881 -6.965713 4.252158 69.53243 7.516840e-17 4.022637e-13
## ENSG00000275266 5.996143 4.363370 66.99688 2.719377e-16 1.164220e-12
## ENSG00000280390 -6.141730 4.886014 65.63278 5.432750e-16 1.938224e-12
## ENSG00000086548 -5.532175 5.913691 64.79711 8.302091e-16 2.538780e-12
## ENSG00000284010 5.897919 3.663154 54.15954 1.848543e-13 4.792689e-10
## ENSG00000162704 -2.194228 7.198675 53.99007 2.015052e-13 4.792689e-10
## ENSG00000157617 -2.071285 6.297929 47.53385 5.406245e-12 1.157261e-08
smoothScatter(x=lrt_Y$table$logFC, y=-log10(lrt_Y$table$PValue))
lrt_Y_table = data.frame(lrt_Y$table, p.adjust(lrt_Y$table$PValue, method = "BH"))
colnames(lrt_Y_table) = c(colnames(lrt_Y$table), "adjPVal_BH")
hist(lrt_Y_table$PValue)
SC vs YT
lrt_YT <- glmLRT(fit,coef = "Condition_DE_YT")
topTags(lrt_YT)
## Coefficient: Condition_DE_YT
## logFC logCPM LR PValue FDR
## ENSG00000137693 -3.424404 6.901856 112.79200 2.396404e-26 5.129742e-22
## ENSG00000212724 -4.123537 5.695629 104.47966 1.588158e-24 1.699805e-20
## ENSG00000254422 -3.503055 4.607185 93.56799 3.924837e-22 2.800502e-18
## ENSG00000267881 -8.711860 4.252158 84.13972 4.610157e-20 2.467125e-16
## ENSG00000128564 3.119758 5.202611 80.29383 3.226752e-19 1.381437e-15
## ENSG00000086548 -6.281735 5.913691 76.37045 2.351439e-18 8.389151e-15
## ENSG00000213417 -3.827566 4.556583 73.78900 8.692954e-18 2.658305e-14
## ENSG00000214518 -3.842997 4.370185 70.96220 3.641359e-17 9.743366e-14
## ENSG00000212725 -3.865983 4.073299 67.10244 2.577594e-16 6.130663e-13
## ENSG00000181444 2.908314 4.667671 64.75289 8.490495e-16 1.817475e-12
smoothScatter(x=lrt_YT$table$logFC, y=-log10(lrt_YT$table$PValue))
lrt_YT_table = data.frame(lrt_YT$table, p.adjust(lrt_YT$table$PValue, method = "BH"))
colnames(lrt_YT_table) = c(colnames(lrt_YT$table), "adjPVal_BH")
hist(lrt_YT_table$PValue)
Format output into excel sheet.
genes = rownames(counts4)
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
head(listAttributes(ensembl), 30)
## name
## 1 ensembl_gene_id
## 2 ensembl_gene_id_version
## 3 ensembl_transcript_id
## 4 ensembl_transcript_id_version
## 5 ensembl_peptide_id
## 6 ensembl_peptide_id_version
## 7 ensembl_exon_id
## 8 description
## 9 chromosome_name
## 10 start_position
## 11 end_position
## 12 strand
## 13 band
## 14 transcript_start
## 15 transcript_end
## 16 transcription_start_site
## 17 transcript_length
## 18 transcript_tsl
## 19 transcript_gencode_basic
## 20 transcript_appris
## 21 external_gene_name
## 22 external_gene_source
## 23 external_transcript_name
## 24 external_transcript_source_name
## 25 transcript_count
## 26 percentage_gene_gc_content
## 27 gene_biotype
## 28 transcript_biotype
## 29 source
## 30 transcript_source
## description page
## 1 Gene stable ID feature_page
## 2 Gene stable ID version feature_page
## 3 Transcript stable ID feature_page
## 4 Transcript stable ID version feature_page
## 5 Protein stable ID feature_page
## 6 Protein stable ID version feature_page
## 7 Exon stable ID feature_page
## 8 Gene description feature_page
## 9 Chromosome/scaffold name feature_page
## 10 Gene start (bp) feature_page
## 11 Gene end (bp) feature_page
## 12 Strand feature_page
## 13 Karyotype band feature_page
## 14 Transcript start (bp) feature_page
## 15 Transcript end (bp) feature_page
## 16 Transcription start site (TSS) feature_page
## 17 Transcript length (including UTRs and CDS) feature_page
## 18 Transcript support level (TSL) feature_page
## 19 GENCODE basic annotation feature_page
## 20 APPRIS annotation feature_page
## 21 Gene name feature_page
## 22 Source of gene name feature_page
## 23 Transcript name feature_page
## 24 Source of transcript name feature_page
## 25 Transcript count feature_page
## 26 Gene % GC content feature_page
## 27 Gene type feature_page
## 28 Transcript type feature_page
## 29 Source (gene) feature_page
## 30 Source (transcript) feature_page
hgnc_swissprot <- getBM(attributes=c('ensembl_gene_id','hgnc_symbol','description'),filters = 'ensembl_gene_id', values = genes , mart = ensembl)
fpkm_log_matrix2 = rpkm (y_for_DE, gene.length=counts1_gene_annotation[keep,"Length"], log = FALSE)
index = match (genes, hgnc_swissprot$ensembl_gene_id)
results = data.frame(hgnc_swissprot[index,], counts4, fpkm_log_matrix2, lrt_T_table, lrt_Y_table, lrt_YT_table)
colnames(results) = c(
colnames(hgnc_swissprot[index,])
, unlist(lapply (colnames(counts4), function(x) paste(x, "counts", sep="_")))
, unlist(lapply (colnames(fpkm_log_matrix2), function(x) paste(x, "fpkm", sep="_")))
, unlist(lapply (colnames(lrt_T_table), function(x) paste(x, "_SC_vs_T", sep="_")))
, unlist(lapply (colnames(lrt_Y_table), function(x) paste(x, "_SC_vs_Y", sep="_")))
, unlist(lapply (colnames(lrt_YT_table), function(x) paste(x, "_SC_vs_YT", sep="_"))) )
curr_date = format(Sys.time(), "%a_%b_%d_%H_hrs_%M_min_%S_sec_%Y")
write.table(results, paste0("~/Downloads/Chen_", curr_date, ".xls"), row.names=F, col.names=T, sep="\t")
Heatmap of DE genes.
pheatmap(fpkm_log_matrix2[order(lrt_T_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],"FocusT_1", "PLCT_1", "S475T_1", "S449T_2")], cluster_rows=FALSE, cluster_cols=FALSE)
pheatmap(fpkm_log_matrix2[order(lrt_Y_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],"FocusY_1", "PLCY_1", "S475Y_1", "S449Y_2")], cluster_rows=FALSE, cluster_cols=FALSE)
pheatmap(fpkm_log_matrix2[order(lrt_YT_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],colnames(fpkm_log_matrix2)[grep("YT", colnames(fpkm_log_matrix2))] )], cluster_rows=FALSE, cluster_cols=FALSE)
#try this
mean_values = (fit$coefficients %*% t(design_matrix_DE))
colnames(mean_values) = colnames(counts4)
rownames(mean_values) = rownames(counts4)
pheatmap(mean_values[order(lrt_T_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],"FocusT_1", "PLCT_1", "S475T_1", "S449T_2")], cluster_rows=FALSE, cluster_cols=FALSE)
pheatmap(mean_values[order(lrt_Y_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],"FocusY_1", "PLCY_1", "S475Y_1", "S449Y_2")], cluster_rows=FALSE, cluster_cols=FALSE)
pheatmap(mean_values[order(lrt_YT_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],colnames(fpkm_log_matrix2)[grep("YT", colnames(fpkm_log_matrix2))] )], cluster_rows=FALSE, cluster_cols=TRUE)
#edit column description
write.table(colnames(results), "~/Downloads/column_description.txt", quote=F, col.names=F, row.names=F,sep="\t")